Cleaning

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import plotly.express as px
import numpy as np
import plotly.graph_objects as go

# Load the raw postings and keep only the columns needed for EDA.
df = pd.read_csv("data/lightcast_job_postings.csv")

columns_to_keep = [
    'COMPANY', 'LOCATION', 'POSTED', 'MIN_EDULEVELS_NAME', 'MAX_EDULEVELS_NAME',
    'MIN_YEARS_EXPERIENCE', 'MAX_YEARS_EXPERIENCE', 'TITLE', 'SKILLS',
    'SPECIALIZED_SKILLS', 'CERTIFICATIONS', 'COMMON_SKILLS', 'SOFTWARE_SKILLS',
    'SOC_2021_4_NAME', 'NAICS_2022_6', 'NAICS2_NAME', 'REMOTE_TYPE_NAME',
    'SALARY', 'TITLE_NAME', 'SKILLS_NAME', 'SPECIALIZED_SKILLS_NAME', 'BODY'
]
# .copy() makes eda_data an independent frame. Without it, the column
# selection can return a view of df, and the in-place fillna calls later
# in this script would raise SettingWithCopyWarning and may silently
# fail to modify the data.
eda_data = df[columns_to_keep].copy()

# Missingness indicators: 1 where a value is absent, 0 where present.
# Correlating these columns shows which fields tend to be missing together.
missing_matrix = eda_data.isnull().astype(int)
corr = missing_matrix.corr().round(2)

# Mask the strict upper triangle (k=1 keeps the diagonal) so each
# column pair is shown exactly once in the heatmap.
mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
masked_corr = corr.mask(mask)

# Cell labels: the rounded correlation as text, blank for masked cells.
text_labels = masked_corr.astype(str)
text_labels[masked_corr.isna()] = ""

# plot
# Lower-triangle heatmap of the missing-value correlations computed above.
heatmap_trace = go.Heatmap(
    z=masked_corr.values,
    x=masked_corr.columns,
    y=masked_corr.index,
    text=text_labels.values,
    texttemplate="%{text}",
    colorscale="Blues",
    colorbar=dict(title="Missing Corr"),
    zmin=0,
    zmax=1,
    hoverinfo='skip',
)
fig = go.Figure(data=heatmap_trace)

layout_opts = dict(
    title="Clean Triangle Missing Value Correlation Heatmap",
    xaxis_tickangle=45,
    width=850,
    height=600,
    margin=dict(t=50, l=80, r=50, b=80),
    font=dict(size=8),
    plot_bgcolor='white',
)
fig.update_layout(**layout_opts)

# Flip the y axis so the triangle reads top-down like a matrix.
fig.update_yaxes(autorange="reversed")
fig.show()

This triangle heatmap visualizes the correlation of missing values between different columns in the dataset. Each square shows how strongly the missingness of two columns is correlated — that is, whether the two fields tend to be absent in the same rows — with darker blue indicating a stronger relationship. Most of the values are very high (close to 1.0), suggesting that when one column is missing, others are often missing too — especially among skill-related fields like SKILLS, SPECIALIZED_SKILLS, and SOFTWARE_SKILLS, which are likely part of the same job posting metadata.

This pattern indicates that missingness is not random, but structured — possibly due to differences in how job descriptions are recorded across roles or industries. For example, a job with no software skill tags might also lack common skills or NAICS codes, hinting at data input gaps rather than actual job content differences. Recognizing these correlations is helpful for choosing imputation strategies or deciding whether to drop certain rows or columns entirely during preprocessing.

# --- Missing-value imputation -------------------------------------------
# Column-level assignment (eda_data[col] = series.fillna(...)) is used
# instead of chained eda_data[col].fillna(..., inplace=True): the chained
# form operates on a temporary, raises FutureWarning in modern pandas,
# and is removed in pandas 3.0.

if "SALARY" in eda_data.columns:
    # Median is robust to the long right tail typical of salary data.
    eda_data["SALARY"] = eda_data["SALARY"].fillna(eda_data["SALARY"].median())
else:
    print("Warning: 'SALARY' column not found in dataframe!")

if "COMPANY" in eda_data.columns:
    eda_data["COMPANY"] = eda_data["COMPANY"].fillna("Unknown")
else:
    print("Warning: 'COMPANY' column not found in dataframe!")

# Fill remaining numeric columns with their mean.
num_cols = eda_data.select_dtypes(include='number').columns
for col in num_cols:
    if eda_data[col].isnull().any():
        eda_data[col] = eda_data[col].fillna(eda_data[col].mean())

# Fill categorical (object) columns with their most frequent value.
cat_cols = eda_data.select_dtypes(include='object').columns
for col in cat_cols:
    if eda_data[col].isnull().any():
        mode = eda_data[col].mode()
        # mode() returns an empty Series for an all-NaN column; the
        # original mode()[0] would raise there, so skip those columns.
        if not mode.empty:
            eda_data[col] = eda_data[col].fillna(mode[0])

# Drop any column that is still more than 50% missing.
eda_data.dropna(thresh=len(eda_data) * 0.5, axis=1, inplace=True)

# --- De-duplication and trimming ----------------------------------------
# A posting counts as a duplicate when title, company, location,
# posting date and body all match.
eda_data = eda_data.drop_duplicates(subset=["TITLE", "COMPANY", "LOCATION", "POSTED", "BODY"])

# Cap BODY at 1000 characters to bound the output file size, then force
# string dtype on the text columns before serialization.
eda_data['BODY'] = eda_data['BODY'].str.slice(0, 1000)
eda_data['BODY'] = eda_data['BODY'].astype(str)
eda_data['COMPANY'] = eda_data['COMPANY'].astype(str)

# NOTE: requires the optional dependency pyarrow (pip install pyarrow);
# without it pandas raises ImportError, as seen in the traceback below.
eda_data.to_parquet('data/eda.parquet', engine='pyarrow', compression='gzip')
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/compat/_optional.py:135, in import_optional_dependency(name, extra, errors, min_version)
    134 try:
--> 135     module = importlib.import_module(name)
    136 except ImportError:

File /Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/importlib/__init__.py:88, in import_module(name, package)
     87         level += 1
---> 88 return _bootstrap._gcd_import(name[level:], package, level)

File <frozen importlib._bootstrap>:1387, in _gcd_import(name, package, level)

File <frozen importlib._bootstrap>:1360, in _find_and_load(name, import_)

File <frozen importlib._bootstrap>:1324, in _find_and_load_unlocked(name, import_)

ModuleNotFoundError: No module named 'pyarrow'

During handling of the above exception, another exception occurred:

ImportError                               Traceback (most recent call last)
Cell In[4], line 2
      1 import pandas as pd
----> 2 eda_data.to_parquet('data/eda.parquet', engine='pyarrow', compression='gzip')

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/util/_decorators.py:333, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
    327 if len(args) > num_allow_args:
    328     warnings.warn(
    329         msg.format(arguments=_format_argument_list(allow_args)),
    330         FutureWarning,
    331         stacklevel=find_stack_level(),
    332     )
--> 333 return func(*args, **kwargs)

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/core/frame.py:3113, in DataFrame.to_parquet(self, path, engine, compression, index, partition_cols, storage_options, **kwargs)
   3032 """
   3033 Write a DataFrame to the binary parquet format.
   3034 
   (...)
   3109 >>> content = f.read()
   3110 """
   3111 from pandas.io.parquet import to_parquet
-> 3113 return to_parquet(
   3114     self,
   3115     path,
   3116     engine,
   3117     compression=compression,
   3118     index=index,
   3119     partition_cols=partition_cols,
   3120     storage_options=storage_options,
   3121     **kwargs,
   3122 )

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/io/parquet.py:476, in to_parquet(df, path, engine, compression, index, storage_options, partition_cols, filesystem, **kwargs)
    474 if isinstance(partition_cols, str):
    475     partition_cols = [partition_cols]
--> 476 impl = get_engine(engine)
    478 path_or_buf: FilePath | WriteBuffer[bytes] = io.BytesIO() if path is None else path
    480 impl.write(
    481     df,
    482     path_or_buf,
   (...)
    488     **kwargs,
    489 )

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/io/parquet.py:78, in get_engine(engine)
     67     raise ImportError(
     68         "Unable to find a usable engine; "
     69         "tried using: 'pyarrow', 'fastparquet'.\n"
   (...)
     74         f"{error_msgs}"
     75     )
     77 if engine == "pyarrow":
---> 78     return PyArrowImpl()
     79 elif engine == "fastparquet":
     80     return FastParquetImpl()

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/io/parquet.py:163, in PyArrowImpl.__init__(self)
    162 def __init__(self) -> None:
--> 163     import_optional_dependency(
    164         "pyarrow", extra="pyarrow is required for parquet support."
    165     )
    166     import pyarrow.parquet
    168     # import utils to register the pyarrow extension types

File ~/ad688-employability-sp25A1-group11-5/.venv/lib/python3.13/site-packages/pandas/compat/_optional.py:138, in import_optional_dependency(name, extra, errors, min_version)
    136 except ImportError:
    137     if errors == "raise":
--> 138         raise ImportError(msg)
    139     return None
    141 # Handle submodules: if we have submodule, grab parent module from sys.modules

ImportError: Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.